read Data
# Load the raw training file with text columns kept as character vectors,
# then discard the first column by position.
# NOTE(review): column 1 is presumably the row identifier -- confirm against
# train.csv before relying on this.
train <- read.csv(file = "train.csv", stringsAsFactors = FALSE)[, -1]
Delete the variables with too many NAs and impute the data. 1. The numbers of missing values in BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1 and BsmtFinType2 are almost identical. It can be inferred that the missing data in these variables share a single cause, most likely the absence of a basement. 2. MasVnrType and MasVnrArea are always NA together. This is most likely because the house has no masonry veneer, which leads to NA in both.
# dimension holds c(n_rows, n_cols) of the training frame; dimension[1] is
# reused further down as the number of training rows.
dimension <- dim(train)

# Count NAs per column and keep only the columns whose NA count is below
# 5% of the number of rows.
missingNum <- vapply(train, function(col) sum(is.na(col)), integer(1))
data <- train[, missingNum < 0.05 * dimension[1]]

# In the five basement descriptor columns, replace NA with the explicit
# label "None".
bsmt_cols <- c("BsmtExposure", "BsmtFinType1", "BsmtFinType2",
               "BsmtQual", "BsmtCond")
for (bsmt_col in bsmt_cols) {
  data[[bsmt_col]][is.na(data[[bsmt_col]])] <- "None"
}

# Refresh the per-column NA counts so they describe `data` after the fill.
missingNum <- vapply(data, function(col) sum(is.na(col)), integer(1))
After filling in the basement-related missing values, the remaining missing values are imputed with MICE. The categorical variables “MasVnrType” and “Electrical” should be converted into factors before the data are imputed.
library(mice)
# Rows with no recorded veneer area: label the veneer type "None" and set the
# area to 0. The mask is computed once, before MasVnrArea is overwritten, so
# both columns are updated for exactly the same rows.
no_veneer <- which(is.na(data$MasVnrArea))
data$MasVnrType[no_veneer] <- "None"
data$MasVnrArea[no_veneer] <- 0

# Electrical is turned into a factor before the imputation call below.
data$Electrical <- as.factor(data$Electrical)

# Single imputation (m = 1). A fixed seed makes the imputed values
# reproducible between runs; 12345 matches the seed used for the train/test
# split later in this file. complete() is namespace-qualified because a
# variable named `complete` is created further down in this script.
data_complete <- mice::complete(
  mice(data, m = 1, printFlag = FALSE, seed = 12345)
)
Make sure there are no missing values inside.
# Total number of NA cells left in the imputed training frame (expected 0).
TotalMissingSum <- sum(is.na(data_complete))
Add the new features: 1. Finished Basement Square Feet (Type I and Type II) 2. 1st and 2nd Floor Square Feet 3. Wood Deck and Open Porch Square Feet 4. Basement Bathrooms 5. Bathrooms 6. Age of house (YrSold - YearBuilt) 7. Years since Last Remodel (YrSold - YearRemodAdd) 8. High Quality Square Feet 9. Total Area
# Engineered features for the training frame. Each new column is appended in
# this exact order because later code selects columns by position.
data_complete$BasementSF <- with(data_complete, BsmtFinSF1 + BsmtFinSF2)
data_complete$OneandTwoFloorSF <- with(data_complete, X1stFlrSF + X2ndFlrSF)
data_complete$FrontSF <- with(data_complete, WoodDeckSF + OpenPorchSF)
# Half baths count as 0.5 of a bathroom.
data_complete$BasementBath <- with(
  data_complete, BsmtFullBath + 0.5 * BsmtHalfBath
)
data_complete$Bath <- with(data_complete, FullBath + 0.5 * HalfBath)
# Years between construction / last remodel and the sale.
data_complete$Age <- with(data_complete, YrSold - YearBuilt)
data_complete$YrOfRemodel <- with(data_complete, YrSold - YearRemodAdd)
data_complete$HighQualSF <- with(
  data_complete,
  BsmtFinSF1 + BsmtFinSF2 + GrLivArea + GarageArea + WoodDeckSF + OpenPorchSF
)
data_complete$TotalArea <- with(
  data_complete,
  GrLivArea + TotalBsmtSF + GarageArea + LotArea + MasVnrArea + OpenPorchSF +
    PoolArea + ScreenPorch + WoodDeckSF + X3SsnPorch + EnclosedPorch
)

# Model the log of the price, then drop the raw SalePrice column. It is
# removed by name rather than by a hard-coded column position, so the step
# keeps working if the set of retained columns changes.
data_complete$LogPrice <- log(data_complete$SalePrice)
data_complete$SalePrice <- NULL
Save the imputed data for future use
# Persist the imputed, feature-augmented training frame without a row-name
# column; it is read back from this file further down.
write.csv(x = data_complete, file = "data_complete.csv", row.names = FALSE)
Preprocess and impute the test dataset. 1. Exterior1st and Exterior2nd are NA together, so the missing data presumably mean that the house has no exterior covering. 2. MasVnrType and MasVnrArea are NA together most of the time. This is probably because the house has no masonry veneer.
# Load the test set, remember its Id column for the submission file, then
# drop the first column by position.
test <- read.csv("test.csv", stringsAsFactors = FALSE)
id <- test$Id
test <- test[, -1]

# Keep exactly the predictors that were retained for training. Selecting by
# name (instead of a hard-coded list of column positions) keeps train and
# test aligned even if the NA filter on the training set changes. Names of
# `data` that do not exist in `test` are skipped by intersect().
all_names <- colnames(data)
testdata <- test[, intersect(all_names, colnames(test))]

# Print the number of NAs per retained test column.
vapply(testdata, function(col) sum(is.na(col)), integer(1))
## MSSubClass MSZoning LotArea Street LotShape
## 0 4 0 0 0
## LandContour Utilities LotConfig LandSlope Neighborhood
## 0 2 0 0 0
## Condition1 Condition2 BldgType HouseStyle OverallQual
## 0 0 0 0 0
## OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl
## 0 0 0 0 0
## Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual
## 1 1 16 15 0
## ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## 0 0 44 45 44
## BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF
## 42 1 42 1 1
## TotalBsmtSF Heating HeatingQC CentralAir Electrical
## 1 0 0 0 0
## X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath
## 0 0 0 0 2
## BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr
## 2 0 0 0 0
## KitchenQual TotRmsAbvGrd Functional Fireplaces GarageCars
## 1 0 2 0 1
## GarageArea PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch
## 1 0 0 0 0
## X3SsnPorch ScreenPorch PoolArea MiscVal MoSold
## 0 0 0 0 0
## YrSold SaleType SaleCondition
## 0 1 0
# Hand-patched rows, applied before the generic "no basement" fill below.
# NOTE(review): "Unf" becomes a BsmtExposure level of its own; the regression
# summary later in this file lists BsmtExposureUnf with an NA coefficient,
# which suggests the label never occurs in the training rows -- confirm this
# is the intended value.
testdata[c(28, 889), "BsmtExposure"] <- "Unf"
testdata[c(758, 759), "BsmtQual"] <- "None"

# Rows whose BsmtExposure is still NA are treated as having no basement. The
# row set is computed once, before BsmtExposure itself is overwritten, so
# every basement column is updated for the same rows.
no_bsmt <- which(is.na(testdata$BsmtExposure))
for (bsmt_col in c("BsmtFinType1", "BsmtFinType2", "BsmtQual", "BsmtCond")) {
  testdata[[bsmt_col]][no_bsmt] <- "None"
}
for (bsmt_sf in c("BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF")) {
  testdata[[bsmt_sf]][no_bsmt] <- 0
}
testdata$BsmtExposure[no_bsmt] <- "None"

# Missing exterior covering is given the explicit label "None".
testdata$Exterior1st[is.na(testdata$Exterior1st)] <- "None"
testdata$Exterior2nd[is.na(testdata$Exterior2nd)] <- "None"

# Rows with no veneer area: type "None", area 0 (mask taken before the
# area column is overwritten).
no_veneer <- which(is.na(testdata$MasVnrArea))
testdata$MasVnrType[no_veneer] <- "None"
testdata$MasVnrArea[no_veneer] <- 0

# Missing Utilities values are set to "AllPub".
testdata$Utilities[is.na(testdata$Utilities)] <- "AllPub"

# These categorical columns are converted to factors before the imputation
# call; the conversion happens after the manual fills above so that "None"
# is part of the factor levels.
for (fct_col in c("MSZoning", "MasVnrType", "KitchenQual", "Functional",
                  "SaleType", "BsmtCond")) {
  testdata[[fct_col]] <- as.factor(testdata[[fct_col]])
}

# Single CART-based imputation of whatever is still missing. The fixed seed
# makes the imputed values reproducible between runs. complete() is
# namespace-qualified because a variable named `complete` is created further
# down in this script.
test_complete <- mice::complete(
  mice(testdata, m = 1, method = "cart", printFlag = FALSE, seed = 12345)
)

# Total number of NA cells left after imputation (expected 0).
TotalMissingNum <- sum(is.na(test_complete))
Add New Features into the test dataset
# Engineered features for the test frame, appended in the same order as for
# the training frame so both frames keep matching column positions.
test_complete$BasementSF <- with(test_complete, BsmtFinSF1 + BsmtFinSF2)
test_complete$OneandTwoFloorSF <- with(test_complete, X1stFlrSF + X2ndFlrSF)
test_complete$FrontSF <- with(test_complete, WoodDeckSF + OpenPorchSF)
# Half baths count as 0.5 of a bathroom.
test_complete$BasementBath <- with(
  test_complete, BsmtFullBath + 0.5 * BsmtHalfBath
)
test_complete$Bath <- with(test_complete, FullBath + 0.5 * HalfBath)
# Years between construction / last remodel and the sale.
test_complete$Age <- with(test_complete, YrSold - YearBuilt)
test_complete$YrOfRemodel <- with(test_complete, YrSold - YearRemodAdd)
test_complete$HighQualSF <- with(
  test_complete,
  BsmtFinSF1 + BsmtFinSF2 + GrLivArea + GarageArea + WoodDeckSF + OpenPorchSF
)
test_complete$TotalArea <- with(
  test_complete,
  GrLivArea + TotalBsmtSF + GarageArea + LotArea + MasVnrArea + OpenPorchSF +
    PoolArea + ScreenPorch + WoodDeckSF + X3SsnPorch + EnclosedPorch
)

# Persist the finished test frame without a row-name column.
write.csv(x = test_complete, file = "test_complete.csv", row.names = FALSE)
Read the imputed data
# Reload both prepared frames from disk.
data_complete <- read.csv("data_complete.csv", header = TRUE)
test_complete <- read.csv("test_complete.csv", header = TRUE)
n_train <- nrow(data_complete)
n_test <- nrow(test_complete)

# Stack train (without the response) on top of test so that model.matrix()
# produces one consistent set of dummy columns for both. The response is
# dropped by name rather than by a hard-coded column position.
complete <- rbind(
  data_complete[, names(data_complete) != "LogPrice"],
  test_complete
)
all_data <- model.matrix(~ ., complete)

# The split below relies on every input row surviving model.matrix(); fail
# with a clear message if rows were lost (e.g. because of remaining NAs).
stopifnot(nrow(all_data) == n_train + n_test)

# Split the design matrix back into its train and test parts, using the
# actual row counts instead of hard-coded row ranges.
train_data <- all_data[seq_len(n_train), ]
test_data <- all_data[n_train + seq_len(n_test), ]
Split the data into train and test sets and generate the linear regression model. (A commonly used train/test ratio is about 80/20, which is also referred to as the Pareto Principle; in this case, the train set is 80% of the original data while the test set is the remaining 20%.)
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-10
# Predictor matrix and response (log price) for the labelled rows.
ind <- train_data
dep <- data_complete$LogPrice

# Reproducible 80/20 split of the labelled rows: train.ind holds the row
# numbers used for fitting, the remaining rows form the hold-out set.
set.seed(12345)
train.ind <- sample(seq_len(dimension[1]), size = dimension[1] * 0.8)

train_ind <- ind[train.ind, ]
test_ind <- ind[-train.ind, ]
train_dep <- dep[train.ind]
test_dep <- dep[-train.ind]

# Regularisation paths for three penalty mixes: alpha = 1 (LASSO),
# alpha = 0 (RIDGE) and alpha = 0.5 (elastic net).
fit.lasso <- glmnet(x = train_ind, y = train_dep, alpha = 1)
fit.ridge <- glmnet(x = train_ind, y = train_dep, alpha = 0)
fit.elnet <- glmnet(x = train_ind, y = train_dep, alpha = 0.5)

# Coefficient paths plotted against lambda.
plot(fit.lasso, xvar = "lambda", main = "LASSO")
plot(fit.ridge, xvar = "lambda", main = "RIDGE")
plot(fit.elnet, xvar = "lambda", main = "ELASTIC NET")
Find the best model with cross-validation: 1. Train 11 models on the training dataset, each with a different alpha (a different mix of LASSO and RIDGE, from 0 to 1 in steps of 0.1) 2. Predict the held-out test set and compare the mean squared errors 3. Compare and find the best alpha value
# Cross-validated glmnet fits for alpha = 0.0, 0.1, ..., 1.0, followed by
# hold-out predictions at each fit's lambda.1se and the resulting hold-out
# MSE. The fits are run in increasing alpha order, one after another.
alpha_grid <- (0:10) / 10
run_tag <- 0:10

cv_fits <- lapply(alpha_grid, function(a) {
  cv.glmnet(x = train_ind, y = train_dep, type.measure = "mse",
            alpha = a, family = "gaussian")
})

holdout_preds <- lapply(cv_fits, function(fit) {
  predict(fit, s = fit$lambda.1se, newx = test_ind)
})

holdout_mse <- vapply(
  holdout_preds,
  function(pred) mean((test_dep - pred)^2),
  numeric(1)
)

# Publish the results under the individual names used elsewhere in this
# script: fit0..fit10, pred0..pred10 and mse0..mse10.
invisible(list2env(
  setNames(cv_fits, paste0("fit", run_tag)), envir = environment()
))
invisible(list2env(
  setNames(holdout_preds, paste0("pred", run_tag)), envir = environment()
))
invisible(list2env(
  setNames(as.list(holdout_mse), paste0("mse", run_tag)),
  envir = environment()
))
Plot Alpha vs MSE
# Hold-out MSE as a function of alpha, with each point labelled by its MSE
# rounded to five decimals.
alpha <- seq(from = 0, to = 1, by = 0.1)
mse <- c(
  mse0, mse1, mse2, mse3, mse4, mse5,
  mse6, mse7, mse8, mse9, mse10
)
plot(x = alpha, y = mse, type = "l", xlab = "alpha", ylab = "MSE")
text(x = alpha, y = mse, labels = round(mse, 5),
     cex = 0.6, pos = 4, col = "red")
Choose the best alpha (alpha = 0.1) for the SalePrice prediction. The score is 0.16360, which ranks 1654.
# Predict log prices for the real test rows with the alpha = 0.1 fit at its
# lambda.1se, convert back to the price scale with exp(), and write the
# two-column (Id, SalePrice) submission file.
prediction <- predict(fit1, newx = test_data, s = fit1$lambda.1se)
df <- setNames(
  data.frame(cbind(id, exp(prediction))),
  c("Id", "SalePrice")
)
write.csv(x = df, file = "result.csv", row.names = FALSE)
Using all variables for Linear Regression. The reason for the NA coefficients of some engineered features is that these features are generated from other features by simple addition and subtraction, which means: EngineeredFeature = Feature1 + Feature2 + Feature3 + … Let’s assume: NewFeature = coef1 * Feature1 + coef2 * Feature2 + coef3 * Feature3 and the linear regression equation for the model is: Response = a * Feature1 + b * Feature2 + c * Feature3 + d * Feature4 + e * NewFeature. The equation can easily be rewritten as: Response = (a + e * coef1) * Feature1 + (b + e * coef2) * Feature2 + (c + e * coef3) * Feature3 + d * Feature4. So, from this perspective, the newly generated features add no information to a linear model that already contains their components, and they are not useful for refining the model.
# Reload both prepared frames. The test frame gets a placeholder LogPrice of
# 0 so that it has the same columns as the training frame for rbind().
data_complete <- read.csv("data_complete.csv", header = TRUE)
test_complete <- read.csv("test_complete.csv", header = TRUE)
test_complete$LogPrice <- 0
n_train <- nrow(data_complete)
n_test <- nrow(test_complete)

# One design matrix for train + test, so both share the same dummy columns.
complete <- rbind(data_complete, test_complete)
all_data <- model.matrix(~ ., complete)

# The split below relies on every input row surviving model.matrix(); fail
# with a clear message if rows were lost (e.g. because of remaining NAs).
stopifnot(nrow(all_data) == n_train + n_test)

# Split by the actual row counts instead of hard-coded row ranges.
train_data <- all_data[seq_len(n_train), ]
test_data <- all_data[n_train + seq_len(n_test), ]
train_feed1 <- data.frame(train_data)
test_feed1 <- data.frame(test_data)

# Ordinary least squares on every column of the design matrix. The response
# is named by column (LogPrice ~ .) so the whole formula is resolved inside
# `data`, rather than reaching back to the global object with `$`.
# The design matrix carries its own intercept column (X.Intercept. after the
# data.frame conversion); lm() adds another, so that column is reported as
# not estimable in the summary.
lmModel1 <- lm(LogPrice ~ ., data = train_feed1)
summary(lmModel1)
##
## Call:
## lm(formula = train_feed1$LogPrice ~ ., data = train_feed1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.69632 -0.04735 0.00349 0.05295 0.69632
##
## Coefficients: (18 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.093e+00 4.777e+00 1.485 0.137818
## X.Intercept. NA NA NA NA
## MSSubClass -3.991e-04 3.781e-04 -1.056 0.291309
## MSZoningFV 4.405e-01 5.450e-02 8.083 1.48e-15 ***
## MSZoningRH 4.111e-01 5.419e-02 7.587 6.39e-14 ***
## MSZoningRL 4.131e-01 4.644e-02 8.895 < 2e-16 ***
## MSZoningRM 3.720e-01 4.347e-02 8.556 < 2e-16 ***
## LotArea 2.976e-06 4.936e-07 6.029 2.17e-09 ***
## StreetPave 1.166e-01 5.447e-02 2.141 0.032443 *
## LotShapeIR2 2.986e-02 1.914e-02 1.560 0.119063
## LotShapeIR3 1.906e-02 4.002e-02 0.476 0.633945
## LotShapeReg 4.951e-03 7.358e-03 0.673 0.501111
## LandContourHLS 3.072e-02 2.351e-02 1.306 0.191685
## LandContourLow -2.344e-02 2.926e-02 -0.801 0.423297
## LandContourLvl 2.798e-02 1.685e-02 1.661 0.096996 .
## UtilitiesNoSeWa -2.187e-01 1.184e-01 -1.847 0.065016 .
## LotConfigCulDSac 2.648e-02 1.462e-02 1.812 0.070272 .
## LotConfigFR2 -3.908e-02 1.831e-02 -2.135 0.032983 *
## LotConfigFR3 -9.870e-02 5.768e-02 -1.711 0.087294 .
## LotConfigInside -1.355e-02 7.987e-03 -1.696 0.090157 .
## LandSlopeMod 3.107e-02 1.815e-02 1.712 0.087101 .
## LandSlopeSev -1.952e-01 5.200e-02 -3.754 0.000182 ***
## NeighborhoodBlueste -5.018e-02 8.715e-02 -0.576 0.564832
## NeighborhoodBrDale -6.606e-02 5.006e-02 -1.319 0.187273
## NeighborhoodBrkSide 1.049e-02 4.282e-02 0.245 0.806585
## NeighborhoodClearCr 1.903e-02 4.209e-02 0.452 0.651175
## NeighborhoodCollgCr -2.086e-02 3.278e-02 -0.636 0.524608
## NeighborhoodCrawfor 1.073e-01 3.884e-02 2.762 0.005830 **
## NeighborhoodEdwards -8.568e-02 3.623e-02 -2.365 0.018202 *
## NeighborhoodGilbert -1.143e-02 3.511e-02 -0.325 0.744881
## NeighborhoodIDOTRR -3.006e-02 4.864e-02 -0.618 0.536739
## NeighborhoodMeadowV -1.672e-01 5.096e-02 -3.281 0.001063 **
## NeighborhoodMitchel -6.140e-02 3.709e-02 -1.655 0.098081 .
## NeighborhoodNAmes -3.810e-02 3.550e-02 -1.073 0.283432
## NeighborhoodNoRidge 3.368e-02 3.834e-02 0.878 0.379878
## NeighborhoodNPkVill -2.206e-03 6.376e-02 -0.035 0.972400
## NeighborhoodNridgHt 7.584e-02 3.368e-02 2.252 0.024519 *
## NeighborhoodNWAmes -4.252e-02 3.644e-02 -1.167 0.243574
## NeighborhoodOldTown -5.663e-02 4.361e-02 -1.299 0.194326
## NeighborhoodSawyer -2.636e-02 3.699e-02 -0.713 0.476212
## NeighborhoodSawyerW -8.628e-03 3.535e-02 -0.244 0.807196
## NeighborhoodSomerst 1.839e-02 4.094e-02 0.449 0.653413
## NeighborhoodStoneBr 1.305e-01 3.779e-02 3.453 0.000573 ***
## NeighborhoodSWISU -9.942e-04 4.377e-02 -0.023 0.981882
## NeighborhoodTimber 1.793e-04 3.705e-02 0.005 0.996141
## NeighborhoodVeenker 4.206e-02 4.787e-02 0.879 0.379788
## Condition1Feedr 2.265e-02 2.246e-02 1.008 0.313482
## Condition1Norm 7.374e-02 1.858e-02 3.970 7.61e-05 ***
## Condition1PosA 3.782e-02 4.536e-02 0.834 0.404534
## Condition1PosN 7.678e-02 3.365e-02 2.282 0.022665 *
## Condition1RRAe -5.077e-02 4.116e-02 -1.233 0.217720
## Condition1RRAn 3.012e-02 3.097e-02 0.972 0.331095
## Condition1RRNe 5.254e-03 8.061e-02 0.065 0.948043
## Condition1RRNn 8.366e-02 5.811e-02 1.440 0.150198
## Condition2Feedr 1.206e-01 1.014e-01 1.189 0.234490
## Condition2Norm 5.611e-02 8.660e-02 0.648 0.517121
## Condition2PosA 2.335e-01 1.673e-01 1.396 0.163096
## Condition2PosN -8.105e-01 1.219e-01 -6.648 4.45e-11 ***
## Condition2RRAe -5.341e-01 2.079e-01 -2.569 0.010306 *
## Condition2RRAn -2.762e-02 1.402e-01 -0.197 0.843789
## Condition2RRNn 2.919e-02 1.198e-01 0.244 0.807569
## BldgType2fmCon 4.890e-02 5.698e-02 0.858 0.390946
## BldgTypeDuplex -1.098e-02 3.314e-02 -0.331 0.740528
## BldgTypeTwnhs -5.434e-02 4.522e-02 -1.202 0.229721
## BldgTypeTwnhsE -7.864e-03 4.077e-02 -0.193 0.847087
## HouseStyle1.5Unf 3.944e-03 3.487e-02 0.113 0.909969
## HouseStyle1Story -3.268e-02 1.919e-02 -1.703 0.088828 .
## HouseStyle2.5Fin -5.205e-02 5.416e-02 -0.961 0.336692
## HouseStyle2.5Unf 5.967e-02 4.129e-02 1.445 0.148684
## HouseStyle2Story -1.658e-02 1.573e-02 -1.054 0.292068
## HouseStyleSFoyer -1.737e-02 2.829e-02 -0.614 0.539422
## HouseStyleSLvl -2.526e-03 2.483e-02 -0.102 0.918991
## OverallQual 4.411e-02 4.585e-03 9.621 < 2e-16 ***
## OverallCond 3.771e-02 3.958e-03 9.527 < 2e-16 ***
## YearBuilt 1.699e-03 3.370e-04 5.042 5.30e-07 ***
## YearRemodAdd 7.517e-04 2.491e-04 3.018 0.002596 **
## RoofStyleGable -1.365e-02 8.354e-02 -0.163 0.870189
## RoofStyleGambrel 2.194e-03 9.123e-02 0.024 0.980820
## RoofStyleHip -9.934e-03 8.378e-02 -0.119 0.905640
## RoofStyleMansard 5.550e-02 9.727e-02 0.571 0.568410
## RoofStyleShed 4.734e-01 1.582e-01 2.993 0.002818 **
## RoofMatlCompShg 2.603e+00 1.497e-01 17.392 < 2e-16 ***
## RoofMatlMembran 2.999e+00 2.177e-01 13.775 < 2e-16 ***
## RoofMatlMetal 2.860e+00 2.134e-01 13.404 < 2e-16 ***
## RoofMatlRoll 2.624e+00 1.882e-01 13.939 < 2e-16 ***
## RoofMatlTar.Grv 2.635e+00 1.725e-01 15.279 < 2e-16 ***
## RoofMatlWdShake 2.531e+00 1.656e-01 15.278 < 2e-16 ***
## RoofMatlWdShngl 2.701e+00 1.549e-01 17.437 < 2e-16 ***
## Exterior1stAsphShn 2.247e-02 1.513e-01 0.148 0.881992
## Exterior1stBrkComm -1.788e-01 1.258e-01 -1.421 0.155470
## Exterior1stBrkFace 1.120e-01 5.672e-02 1.975 0.048435 *
## Exterior1stCBlock -4.665e-02 1.235e-01 -0.378 0.705790
## Exterior1stCemntBd -6.269e-02 8.565e-02 -0.732 0.464356
## Exterior1stHdBoard 2.085e-02 5.731e-02 0.364 0.716004
## Exterior1stImStucc 7.525e-03 1.257e-01 0.060 0.952275
## Exterior1stMetalSd 6.372e-02 6.533e-02 0.975 0.329545
## Exterior1stPlywood 2.038e-02 5.665e-02 0.360 0.719148
## Exterior1stStone 8.705e-02 1.100e-01 0.791 0.428930
## Exterior1stStucco 5.388e-02 6.219e-02 0.866 0.386495
## Exterior1stVinylSd 2.468e-02 5.924e-02 0.417 0.677083
## Exterior1stWd.Sdng -9.551e-03 5.465e-02 -0.175 0.861293
## Exterior1stWdShing 2.510e-02 5.908e-02 0.425 0.671051
## Exterior1stNone NA NA NA NA
## Exterior2ndAsphShn 1.185e-03 1.005e-01 0.012 0.990589
## Exterior2ndBrk.Cmn 1.002e-02 9.114e-02 0.110 0.912456
## Exterior2ndBrkFace -5.371e-02 5.898e-02 -0.911 0.362636
## Exterior2ndCBlock NA NA NA NA
## Exterior2ndCmentBd 1.109e-01 8.436e-02 1.315 0.188681
## Exterior2ndHdBoard -7.326e-03 5.533e-02 -0.132 0.894685
## Exterior2ndImStucc 1.156e-02 6.376e-02 0.181 0.856123
## Exterior2ndMetalSd -2.099e-02 6.380e-02 -0.329 0.742201
## Exterior2ndOther -1.047e-01 1.246e-01 -0.841 0.400602
## Exterior2ndPlywood -3.483e-03 5.363e-02 -0.065 0.948233
## Exterior2ndStone -8.308e-02 7.703e-02 -1.079 0.281011
## Exterior2ndStucco -1.526e-02 6.004e-02 -0.254 0.799334
## Exterior2ndVinylSd 1.221e-02 5.724e-02 0.213 0.831077
## Exterior2ndWd.Sdng 3.178e-02 5.295e-02 0.600 0.548494
## Exterior2ndWd.Shng -1.393e-02 5.489e-02 -0.254 0.799711
## Exterior2ndNone NA NA NA NA
## MasVnrTypeBrkFace 3.372e-02 3.102e-02 1.087 0.277264
## MasVnrTypeNone 2.674e-02 3.128e-02 0.855 0.392765
## MasVnrTypeStone 4.229e-02 3.280e-02 1.289 0.197584
## MasVnrArea 1.032e-05 2.636e-05 0.392 0.695473
## ExterQualFa 2.861e-02 4.965e-02 0.576 0.564543
## ExterQualGd 1.037e-02 2.195e-02 0.473 0.636576
## ExterQualTA 1.554e-02 2.427e-02 0.640 0.522223
## ExterCondFa -9.787e-02 8.279e-02 -1.182 0.237418
## ExterCondGd -6.816e-02 7.912e-02 -0.862 0.389087
## ExterCondPo -1.026e-01 1.441e-01 -0.712 0.476602
## ExterCondTA -5.020e-02 7.894e-02 -0.636 0.524961
## FoundationCBlock 2.411e-02 1.436e-02 1.680 0.093280 .
## FoundationPConc 4.026e-02 1.549e-02 2.599 0.009453 **
## FoundationSlab -3.188e-02 4.567e-02 -0.698 0.485320
## FoundationStone 1.292e-01 4.918e-02 2.627 0.008731 **
## FoundationWood -1.184e-01 6.658e-02 -1.778 0.075613 .
## BsmtQualFa -3.438e-02 2.884e-02 -1.192 0.233428
## BsmtQualGd -2.744e-02 1.508e-02 -1.820 0.068924 .
## BsmtQualNone 1.532e-01 1.688e-01 0.908 0.364259
## BsmtQualTA -3.512e-02 1.866e-02 -1.882 0.060021 .
## BsmtCondGd 1.807e-02 2.394e-02 0.755 0.450447
## BsmtCondNone NA NA NA NA
## BsmtCondPo 3.218e-01 1.368e-01 2.353 0.018783 *
## BsmtCondTA 1.918e-02 1.912e-02 1.003 0.315985
## BsmtExposureGd 2.725e-02 1.375e-02 1.982 0.047744 *
## BsmtExposureMn -7.535e-03 1.382e-02 -0.545 0.585733
## BsmtExposureNo -1.172e-02 1.003e-02 -1.168 0.242975
## BsmtExposureNone -5.027e-02 1.064e-01 -0.472 0.636743
## BsmtExposureUnf NA NA NA NA
## BsmtFinType1BLQ -4.681e-03 1.259e-02 -0.372 0.710135
## BsmtFinType1GLQ 1.042e-02 1.152e-02 0.905 0.365842
## BsmtFinType1LwQ -2.367e-02 1.703e-02 -1.390 0.164831
## BsmtFinType1None NA NA NA NA
## BsmtFinType1Rec -6.503e-03 1.361e-02 -0.478 0.632988
## BsmtFinType1Unf -1.394e-02 1.333e-02 -1.046 0.295919
## BsmtFinSF1 1.394e-04 2.397e-05 5.815 7.69e-09 ***
## BsmtFinType2BLQ -7.004e-02 3.454e-02 -2.028 0.042819 *
## BsmtFinType2GLQ -1.893e-03 4.267e-02 -0.044 0.964615
## BsmtFinType2LwQ -3.667e-02 3.368e-02 -1.089 0.276432
## BsmtFinType2None -1.311e-01 1.155e-01 -1.135 0.256631
## BsmtFinType2Rec -2.760e-02 3.248e-02 -0.850 0.395688
## BsmtFinType2Unf -1.441e-02 3.454e-02 -0.417 0.676538
## BsmtFinSF2 1.377e-04 4.147e-05 3.320 0.000926 ***
## BsmtUnfSF 8.112e-05 2.188e-05 3.708 0.000218 ***
## TotalBsmtSF NA NA NA NA
## HeatingGasA 1.574e-01 1.159e-01 1.357 0.174879
## HeatingGasW 2.215e-01 1.191e-01 1.860 0.063190 .
## HeatingGrav 8.893e-03 1.256e-01 0.071 0.943550
## HeatingOthW 1.388e-01 1.430e-01 0.971 0.331808
## HeatingWall 2.582e-01 1.343e-01 1.922 0.054852 .
## HeatingQCFa -2.330e-02 2.136e-02 -1.091 0.275459
## HeatingQCGd -2.186e-02 9.435e-03 -2.316 0.020694 *
## HeatingQCPo -1.023e-01 1.225e-01 -0.835 0.404048
## HeatingQCTA -3.371e-02 9.394e-03 -3.588 0.000346 ***
## CentralAirY 6.738e-02 1.766e-02 3.814 0.000143 ***
## ElectricalFuseF -2.468e-04 2.644e-02 -0.009 0.992553
## ElectricalFuseP -3.970e-02 7.822e-02 -0.508 0.611828
## ElectricalMix -1.889e-01 1.832e-01 -1.031 0.302911
## ElectricalSBrkr -1.429e-02 1.347e-02 -1.061 0.288977
## X1stFlrSF 2.355e-04 2.526e-05 9.323 < 2e-16 ***
## X2ndFlrSF 2.113e-04 2.354e-05 8.977 < 2e-16 ***
## LowQualFinSF 1.905e-04 8.238e-05 2.312 0.020917 *
## GrLivArea NA NA NA NA
## BsmtFullBath 2.286e-02 9.026e-03 2.533 0.011446 *
## BsmtHalfBath 3.161e-04 1.383e-02 0.023 0.981766
## FullBath 1.931e-02 9.937e-03 1.943 0.052267 .
## HalfBath 2.142e-02 9.496e-03 2.255 0.024285 *
## BedroomAbvGr 5.560e-03 6.177e-03 0.900 0.368266
## KitchenAbvGr -3.857e-02 2.558e-02 -1.508 0.131865
## KitchenQualFa -5.759e-02 2.825e-02 -2.038 0.041713 *
## KitchenQualGd -6.342e-02 1.559e-02 -4.069 5.01e-05 ***
## KitchenQualTA -6.466e-02 1.767e-02 -3.658 0.000265 ***
## TotRmsAbvGrd 5.013e-03 4.309e-03 1.164 0.244812
## FunctionalMaj2 -2.313e-01 6.600e-02 -3.504 0.000475 ***
## FunctionalMin1 3.666e-02 3.924e-02 0.934 0.350385
## FunctionalMin2 3.063e-02 3.916e-02 0.782 0.434246
## FunctionalMod -6.043e-02 4.782e-02 -1.264 0.206551
## FunctionalSev -2.838e-01 1.265e-01 -2.243 0.025043 *
## FunctionalTyp 6.751e-02 3.395e-02 1.988 0.046988 *
## Fireplaces 2.422e-02 6.071e-03 3.989 7.01e-05 ***
## GarageCars 2.377e-02 9.805e-03 2.424 0.015490 *
## GarageArea 1.246e-04 3.384e-05 3.683 0.000241 ***
## PavedDriveP 1.573e-02 2.460e-02 0.639 0.522639
## PavedDriveY 2.345e-02 1.546e-02 1.517 0.129567
## WoodDeckSF 9.299e-05 2.647e-05 3.513 0.000460 ***
## OpenPorchSF 5.752e-05 5.249e-05 1.096 0.273303
## EnclosedPorch 1.234e-04 5.654e-05 2.183 0.029207 *
## X3SsnPorch 1.654e-04 1.019e-04 1.623 0.104752
## ScreenPorch 2.693e-04 5.547e-05 4.855 1.36e-06 ***
## PoolArea 1.613e-04 8.238e-05 1.959 0.050382 .
## MiscVal 9.047e-08 6.479e-06 0.014 0.988862
## MoSold -6.768e-04 1.120e-03 -0.604 0.545660
## YrSold -2.262e-03 2.350e-03 -0.963 0.335767
## SaleTypeCon 8.380e-02 8.086e-02 1.036 0.300236
## SaleTypeConLD 1.360e-01 4.413e-02 3.082 0.002104 **
## SaleTypeConLI -4.071e-02 5.236e-02 -0.778 0.436942
## SaleTypeConLw 9.097e-03 5.540e-02 0.164 0.869589
## SaleTypeCWD 6.371e-02 5.923e-02 1.076 0.282265
## SaleTypeNew 7.478e-02 7.106e-02 1.052 0.292831
## SaleTypeOth 6.229e-02 6.623e-02 0.940 0.347146
## SaleTypeWD -2.178e-02 1.914e-02 -1.138 0.255431
## SaleConditionAdjLand 1.073e-01 6.627e-02 1.619 0.105643
## SaleConditionAlloca 7.345e-02 3.900e-02 1.883 0.059885 .
## SaleConditionFamily 1.596e-02 2.786e-02 0.573 0.566971
## SaleConditionNormal 6.782e-02 1.318e-02 5.146 3.09e-07 ***
## SaleConditionPartial 1.788e-02 6.845e-02 0.261 0.793970
## BasementSF NA NA NA NA
## OneandTwoFloorSF NA NA NA NA
## FrontSF NA NA NA NA
## BasementBath NA NA NA NA
## Bath NA NA NA NA
## Age NA NA NA NA
## YrOfRemodel NA NA NA NA
## HighQualSF NA NA NA NA
## TotalArea NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1049 on 1244 degrees of freedom
## Multiple R-squared: 0.9412, Adjusted R-squared: 0.9311
## F-statistic: 92.65 on 215 and 1244 DF, p-value: < 2.2e-16
# Draw the default diagnostic plots for the all-variables linear model.
plot(x = lmModel1)
## Warning: not plotting observations with leverage one:
## 121, 251, 272, 333, 376, 399, 584, 596, 667, 945, 949, 1004, 1012, 1188, 1231, 1271, 1276, 1299, 1322, 1371
## Warning: not plotting observations with leverage one:
## 121, 251, 272, 333, 376, 399, 584, 596, 667, 945, 949, 1004, 1012, 1188, 1231, 1271, 1276, 1299, 1322, 1371
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
Keep only the most significant variables and run the linear regression again
# Column positions of the retained predictors plus the response. The same
# positions are applied to both frames, so they are written down only once.
keep_cols <- c(2, 3, 10, 11, 12, 15, 16, 17, 20, 32, 34, 35, 38, 39, 41, 42,
               45, 51, 53, 54, 58, 62, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
               78)
train_data <- data_complete[, keep_cols]
test_data <- test_complete[, keep_cols]
n_train <- nrow(train_data)
n_test <- nrow(test_data)

# One design matrix for train + test, so both share the same dummy columns.
complete <- rbind(train_data, test_data)
all_data <- model.matrix(~ ., complete)

# The split below relies on every input row surviving model.matrix().
stopifnot(nrow(all_data) == n_train + n_test)

# Split by the actual row counts instead of hard-coded row ranges.
train_feed2 <- data.frame(all_data[seq_len(n_train), ])
test_feed2 <- data.frame(all_data[n_train + seq_len(n_test), ])

# Refit on the reduced column set; the response is named by column so the
# formula is resolved entirely inside `data`.
lmModel2 <- lm(LogPrice ~ ., data = train_feed2)
summary(lmModel2)
##
## Call:
## lm(formula = train_feed2$LogPrice ~ ., data = train_feed2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.73397 -0.05346 0.00243 0.05756 0.73397
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.803e+00 4.657e+00 1.246 0.213024
## X.Intercept. NA NA NA NA
## MSZoningFV 3.649e-01 5.328e-02 6.849 1.12e-11 ***
## MSZoningRH 3.500e-01 5.327e-02 6.570 7.13e-11 ***
## MSZoningRL 3.736e-01 4.462e-02 8.372 < 2e-16 ***
## MSZoningRM 3.183e-01 4.196e-02 7.585 6.09e-14 ***
## LotArea -3.304e-05 1.884e-05 -1.754 0.079645 .
## NeighborhoodBlueste -5.450e-02 8.630e-02 -0.631 0.527825
## NeighborhoodBrDale -1.116e-01 4.483e-02 -2.491 0.012871 *
## NeighborhoodBrkSide 5.553e-02 3.719e-02 1.493 0.135664
## NeighborhoodClearCr 5.660e-02 3.825e-02 1.480 0.139234
## NeighborhoodCollgCr 2.063e-02 2.953e-02 0.699 0.484965
## NeighborhoodCrawfor 1.479e-01 3.517e-02 4.205 2.78e-05 ***
## NeighborhoodEdwards -3.385e-02 3.273e-02 -1.034 0.301182
## NeighborhoodGilbert 4.142e-02 3.133e-02 1.322 0.186295
## NeighborhoodIDOTRR 2.987e-02 4.270e-02 0.699 0.484414
## NeighborhoodMeadowV -1.555e-01 4.403e-02 -3.533 0.000425 ***
## NeighborhoodMitchel -2.168e-02 3.339e-02 -0.649 0.516330
## NeighborhoodNAmes 7.881e-03 3.132e-02 0.252 0.801330
## NeighborhoodNoRidge 5.259e-02 3.492e-02 1.506 0.132284
## NeighborhoodNPkVill -3.213e-02 4.819e-02 -0.667 0.505044
## NeighborhoodNridgHt 8.038e-02 3.147e-02 2.554 0.010751 *
## NeighborhoodNWAmes -1.200e-02 3.280e-02 -0.366 0.714628
## NeighborhoodOldTown -2.501e-04 3.801e-02 -0.007 0.994750
## NeighborhoodSawyer 5.545e-03 3.316e-02 0.167 0.867230
## NeighborhoodSawyerW 5.176e-03 3.246e-02 0.159 0.873326
## NeighborhoodSomerst 5.698e-02 3.788e-02 1.504 0.132793
## NeighborhoodStoneBr 1.233e-01 3.632e-02 3.396 0.000703 ***
## NeighborhoodSWISU 2.516e-02 4.021e-02 0.626 0.531597
## NeighborhoodTimber 3.073e-02 3.410e-02 0.901 0.367688
## NeighborhoodVeenker 8.350e-02 4.493e-02 1.858 0.063351 .
## Condition1Feedr 1.771e-02 2.232e-02 0.794 0.427480
## Condition1Norm 6.647e-02 1.829e-02 3.635 0.000288 ***
## Condition1PosA 5.474e-02 4.558e-02 1.201 0.229936
## Condition1PosN 7.462e-02 3.352e-02 2.226 0.026166 *
## Condition1RRAe -4.987e-02 4.186e-02 -1.191 0.233746
## Condition1RRAn 3.938e-02 3.092e-02 1.273 0.203107
## Condition1RRNe 4.677e-02 8.298e-02 0.564 0.573048
## Condition1RRNn 4.750e-02 5.575e-02 0.852 0.394287
## Condition2Feedr 1.126e-01 9.697e-02 1.162 0.245598
## Condition2Norm 7.517e-02 8.249e-02 0.911 0.362371
## Condition2PosA 2.306e-01 1.418e-01 1.626 0.104119
## Condition2PosN -8.847e-01 1.202e-01 -7.361 3.13e-13 ***
## Condition2RRAe -7.722e-02 1.405e-01 -0.550 0.582635
## Condition2RRAn -4.596e-02 1.417e-01 -0.324 0.745706
## Condition2RRNn 5.794e-02 1.167e-01 0.496 0.619626
## OverallQual 5.442e-02 4.239e-03 12.839 < 2e-16 ***
## OverallCond 3.705e-02 3.626e-03 10.217 < 2e-16 ***
## YearBuilt 7.967e-04 2.315e-03 0.344 0.730815
## RoofMatlCompShg 2.791e+00 1.309e-01 21.319 < 2e-16 ***
## RoofMatlMembran 3.056e+00 1.794e-01 17.032 < 2e-16 ***
## RoofMatlMetal 2.875e+00 1.751e-01 16.417 < 2e-16 ***
## RoofMatlRoll 2.760e+00 1.711e-01 16.132 < 2e-16 ***
## RoofMatlTar.Grv 2.813e+00 1.360e-01 20.678 < 2e-16 ***
## RoofMatlWdShake 2.774e+00 1.415e-01 19.600 < 2e-16 ***
## RoofMatlWdShngl 2.859e+00 1.372e-01 20.833 < 2e-16 ***
## BsmtFinSF1 -3.064e-05 2.471e-05 -1.240 0.215116
## BsmtFinSF2 -6.366e-05 2.992e-05 -2.128 0.033538 *
## BsmtUnfSF 5.634e-05 2.403e-05 2.344 0.019221 *
## HeatingQCFa -3.736e-02 1.922e-02 -1.944 0.052070 .
## HeatingQCGd -2.208e-02 9.445e-03 -2.338 0.019546 *
## HeatingQCPo -2.249e-03 1.205e-01 -0.019 0.985105
## HeatingQCTA -3.527e-02 9.036e-03 -3.903 9.95e-05 ***
## CentralAirY 7.002e-02 1.517e-02 4.617 4.26e-06 ***
## X1stFlrSF 6.519e-05 2.843e-05 2.293 0.021998 *
## X2ndFlrSF 5.560e-05 2.426e-05 2.292 0.022068 *
## BsmtFullBath 1.306e-02 2.594e-02 0.504 0.614626
## KitchenQualFa -9.053e-02 2.638e-02 -3.432 0.000616 ***
## KitchenQualGd -6.538e-02 1.435e-02 -4.557 5.66e-06 ***
## KitchenQualTA -7.231e-02 1.676e-02 -4.315 1.71e-05 ***
## FunctionalMaj2 -1.629e-01 6.011e-02 -2.710 0.006816 **
## FunctionalMin1 6.772e-02 3.753e-02 1.804 0.071386 .
## FunctionalMin2 7.184e-02 3.714e-02 1.934 0.053280 .
## FunctionalMod -3.628e-02 4.379e-02 -0.829 0.407524
## FunctionalSev -3.334e-01 1.238e-01 -2.693 0.007165 **
## FunctionalTyp 1.066e-01 3.180e-02 3.353 0.000822 ***
## Fireplaces 2.656e-02 6.024e-03 4.409 1.12e-05 ***
## WoodDeckSF 9.812e-06 5.691e-05 0.172 0.863128
## ScreenPorch 2.124e-04 5.870e-05 3.619 0.000307 ***
## SaleConditionAdjLand 1.104e-01 5.972e-02 1.849 0.064718 .
## SaleConditionAlloca 4.383e-02 3.650e-02 1.201 0.230108
## SaleConditionFamily 6.564e-03 2.785e-02 0.236 0.813727
## SaleConditionNormal 7.119e-02 1.221e-02 5.828 6.97e-09 ***
## SaleConditionPartial 1.264e-01 1.706e-02 7.409 2.22e-13 ***
## BasementSF NA NA NA NA
## OneandTwoFloorSF NA NA NA NA
## FrontSF -1.118e-04 5.477e-05 -2.041 0.041423 *
## BasementBath 9.121e-03 2.717e-02 0.336 0.737129
## Bath 1.389e-02 9.405e-03 1.477 0.140023
## Age -1.313e-03 2.304e-03 -0.570 0.568775
## YrOfRemodel -6.994e-04 2.383e-04 -2.934 0.003397 **
## HighQualSF 1.629e-04 2.733e-05 5.961 3.18e-09 ***
## TotalArea 3.498e-05 1.883e-05 1.857 0.063484 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1114 on 1370 degrees of freedom
## Multiple R-squared: 0.927, Adjusted R-squared: 0.9223
## F-statistic: 195.5 on 89 and 1370 DF, p-value: < 2.2e-16
# Draw the regression diagnostic plots for lmModel2; the discussion that
# follows uses them to identify influential observations.
plot(lmModel2)
## Warning: not plotting observations with leverage one:
## 326, 584, 667, 1004, 1231, 1276, 1299
## Warning: not plotting observations with leverage one:
## 326, 584, 667, 1004, 1231, 1276, 1299
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
From the four diagnostic plots, it is clear that observations 826 and 524 are very unusual and may be outliers: they have a high Cook’s distance, high leverage and large residuals. They can therefore be classified as outliers with both high leverage and high residuals, which gives them a strong influence on the fitted model. With or without these two outliers, the results can be quite different.
# Exclude observations 524 and 826 (flagged as influential in the lmModel2
# diagnostics) and refit the same specification on the remaining rows.
train_feed3 <- train_feed2[-c(524, 826), , drop = FALSE]
# NOTE(review): `LogPrice ~ .` would be the more idiomatic formula; the
# `train_feed3$LogPrice` form is kept so the echoed Call stays unchanged.
lmModel3 <- lm(formula = train_feed3$LogPrice ~ ., data = train_feed3)
summary(lmModel3)
##
## Call:
## lm(formula = train_feed3$LogPrice ~ ., data = train_feed3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.69474 -0.05243 0.00152 0.05744 0.50044
##
## Coefficients: (4 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.457e+00 4.499e+00 1.213 0.225377
## X.Intercept. NA NA NA NA
## MSZoningFV 3.684e-01 5.148e-02 7.157 1.34e-12 ***
## MSZoningRH 3.521e-01 5.146e-02 6.841 1.18e-11 ***
## MSZoningRL 3.757e-01 4.311e-02 8.715 < 2e-16 ***
## MSZoningRM 3.189e-01 4.054e-02 7.866 7.37e-15 ***
## LotArea -3.054e-05 1.820e-05 -1.678 0.093552 .
## NeighborhoodBlueste -4.547e-02 8.338e-02 -0.545 0.585586
## NeighborhoodBrDale -1.033e-01 4.331e-02 -2.385 0.017197 *
## NeighborhoodBrkSide 6.657e-02 3.595e-02 1.852 0.064277 .
## NeighborhoodClearCr 5.260e-02 3.696e-02 1.423 0.154906
## NeighborhoodCollgCr 1.786e-02 2.853e-02 0.626 0.531355
## NeighborhoodCrawfor 1.523e-01 3.398e-02 4.482 8.01e-06 ***
## NeighborhoodEdwards -2.037e-02 3.165e-02 -0.644 0.519912
## NeighborhoodGilbert 3.909e-02 3.026e-02 1.292 0.196663
## NeighborhoodIDOTRR 4.216e-02 4.127e-02 1.022 0.307191
## NeighborhoodMeadowV -1.491e-01 4.254e-02 -3.505 0.000472 ***
## NeighborhoodMitchel -2.116e-02 3.226e-02 -0.656 0.512023
## NeighborhoodNAmes 1.246e-02 3.026e-02 0.412 0.680482
## NeighborhoodNoRidge 3.949e-02 3.376e-02 1.170 0.242320
## NeighborhoodNPkVill -2.662e-02 4.656e-02 -0.572 0.567581
## NeighborhoodNridgHt 6.311e-02 3.045e-02 2.073 0.038402 *
## NeighborhoodNWAmes -1.231e-02 3.169e-02 -0.388 0.697785
## NeighborhoodOldTown 9.786e-03 3.673e-02 0.266 0.789951
## NeighborhoodSawyer 1.054e-02 3.204e-02 0.329 0.742364
## NeighborhoodSawyerW 3.113e-03 3.136e-02 0.099 0.920919
## NeighborhoodSomerst 5.109e-02 3.660e-02 1.396 0.163020
## NeighborhoodStoneBr 1.171e-01 3.509e-02 3.337 0.000869 ***
## NeighborhoodSWISU 3.182e-02 3.885e-02 0.819 0.412967
## NeighborhoodTimber 2.545e-02 3.294e-02 0.772 0.439961
## NeighborhoodVeenker 8.338e-02 4.341e-02 1.921 0.054953 .
## Condition1Feedr 1.736e-02 2.156e-02 0.805 0.420809
## Condition1Norm 6.638e-02 1.767e-02 3.757 0.000179 ***
## Condition1PosA 4.985e-02 4.404e-02 1.132 0.257832
## Condition1PosN 6.783e-02 3.239e-02 2.094 0.036432 *
## Condition1RRAe -5.299e-02 4.044e-02 -1.310 0.190301
## Condition1RRAn 3.646e-02 2.988e-02 1.220 0.222543
## Condition1RRNe 4.231e-02 8.016e-02 0.528 0.597681
## Condition1RRNn 4.601e-02 5.385e-02 0.854 0.393097
## Condition2Feedr 1.184e-01 9.368e-02 1.264 0.206498
## Condition2Norm 7.626e-02 7.969e-02 0.957 0.338795
## Condition2PosA 2.123e-01 1.370e-01 1.549 0.121516
## Condition2PosN NA NA NA NA
## Condition2RRAe -9.479e-02 1.357e-01 -0.698 0.485044
## Condition2RRAn -5.037e-02 1.369e-01 -0.368 0.712959
## Condition2RRNn 6.629e-02 1.127e-01 0.588 0.556636
## OverallQual 5.336e-02 4.096e-03 13.027 < 2e-16 ***
## OverallCond 3.723e-02 3.503e-03 10.627 < 2e-16 ***
## YearBuilt 9.061e-04 2.237e-03 0.405 0.685437
## RoofMatlCompShg 2.897e+00 1.269e-01 22.828 < 2e-16 ***
## RoofMatlMembran 3.182e+00 1.738e-01 18.308 < 2e-16 ***
## RoofMatlMetal 2.990e+00 1.696e-01 17.635 < 2e-16 ***
## RoofMatlRoll 2.860e+00 1.656e-01 17.273 < 2e-16 ***
## RoofMatlTar.Grv 2.914e+00 1.318e-01 22.108 < 2e-16 ***
## RoofMatlWdShake 2.879e+00 1.371e-01 20.993 < 2e-16 ***
## RoofMatlWdShngl 2.957e+00 1.329e-01 22.241 < 2e-16 ***
## BsmtFinSF1 -1.202e-05 2.394e-05 -0.502 0.615698
## BsmtFinSF2 -4.878e-05 2.894e-05 -1.686 0.092106 .
## BsmtUnfSF 6.306e-05 2.323e-05 2.715 0.006713 **
## HeatingQCFa -3.575e-02 1.856e-02 -1.926 0.054316 .
## HeatingQCGd -2.141e-02 9.125e-03 -2.346 0.019100 *
## HeatingQCPo 5.038e-03 1.164e-01 0.043 0.965472
## HeatingQCTA -3.479e-02 8.730e-03 -3.985 7.10e-05 ***
## CentralAirY 7.081e-02 1.465e-02 4.833 1.50e-06 ***
## X1stFlrSF 9.012e-05 2.758e-05 3.268 0.001112 **
## X2ndFlrSF 8.106e-05 2.358e-05 3.438 0.000603 ***
## BsmtFullBath 1.455e-02 2.506e-02 0.581 0.561545
## KitchenQualFa -9.101e-02 2.548e-02 -3.572 0.000367 ***
## KitchenQualGd -6.593e-02 1.386e-02 -4.757 2.17e-06 ***
## KitchenQualTA -7.314e-02 1.619e-02 -4.517 6.81e-06 ***
## FunctionalMaj2 -1.497e-01 5.808e-02 -2.578 0.010049 *
## FunctionalMin1 7.607e-02 3.627e-02 2.098 0.036127 *
## FunctionalMin2 7.843e-02 3.588e-02 2.186 0.028999 *
## FunctionalMod -3.640e-02 4.230e-02 -0.860 0.389703
## FunctionalSev -3.400e-01 1.196e-01 -2.843 0.004542 **
## FunctionalTyp 1.155e-01 3.074e-02 3.757 0.000179 ***
## Fireplaces 2.398e-02 5.825e-03 4.117 4.06e-05 ***
## WoodDeckSF -3.575e-05 5.516e-05 -0.648 0.517018
## ScreenPorch 2.073e-04 5.671e-05 3.655 0.000267 ***
## SaleConditionAdjLand 1.059e-01 5.769e-02 1.836 0.066618 .
## SaleConditionAlloca 4.225e-02 3.526e-02 1.198 0.231036
## SaleConditionFamily 5.921e-03 2.691e-02 0.220 0.825880
## SaleConditionNormal 7.242e-02 1.180e-02 6.137 1.10e-09 ***
## SaleConditionPartial 1.281e-01 1.649e-02 7.773 1.50e-14 ***
## BasementSF NA NA NA NA
## OneandTwoFloorSF NA NA NA NA
## FrontSF -5.704e-05 5.320e-05 -1.072 0.283844
## BasementBath 5.251e-03 2.625e-02 0.200 0.841457
## Bath 1.094e-02 9.090e-03 1.203 0.229010
## Age -1.388e-03 2.226e-03 -0.623 0.533183
## YrOfRemodel -6.950e-04 2.303e-04 -3.019 0.002586 **
## HighQualSF 1.543e-04 2.642e-05 5.841 6.47e-09 ***
## TotalArea 3.262e-05 1.820e-05 1.793 0.073229 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1076 on 1369 degrees of freedom
## Multiple R-squared: 0.9317, Adjusted R-squared: 0.9273
## F-statistic: 212.3 on 88 and 1369 DF, p-value: < 2.2e-16
# Draw the regression diagnostic plots for lmModel3, the model refit without
# observations 524 and 826.
plot(lmModel3)
## Warning: not plotting observations with leverage one:
## 272, 326, 583, 666, 1002, 1229, 1274, 1297
## Warning: not plotting observations with leverage one:
## 272, 326, 583, 666, 1002, 1229, 1274, 1297
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
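Compare the coefficients with (lmModel2) and without (lmModel3) the two potential outliers (826 and 524). ChangePercent is the absolute change relative to the coefficient without outliers, expressed as a fraction (1 = 100%). From the result, it can be seen that some of the coefficients change a lot: several change by more than 80%, and a few by more than 100%.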
library(qpcR)
## Loading required package: MASS
## Loading required package: minpack.lm
## Loading required package: rgl
## Loading required package: robustbase
# Side-by-side comparison of the coefficients of lmModel2 (fit with
# observations 524 and 826) and lmModel3 (fit without them).
#
# The two coefficient vectors are aligned by term name rather than by
# position, so a term that exists in only one model appears as NA instead of
# silently shifting every later row. This uses base R only and no longer
# depends on the unexported helper qpcR:::cbind.na.
coef_terms <- union(names(coef(lmModel2)), names(coef(lmModel3)))
coefMatrix <- cbind(coef(lmModel2)[coef_terms], coef(lmModel3)[coef_terms])
# Indexing by a missing name yields an NA-named element, so set the row names
# explicitly from the full term list.
rownames(coefMatrix) <- coef_terms
coefFrame <- data.frame(coefMatrix)
colnames(coefFrame) <- c("With_Outliers", "Without_Outliers")
# Absolute change relative to the estimate without outliers, expressed as a
# fraction (1 = 100%). The result is NA when either estimate is zero or NA.
coefFrame$ChangePercent <- ifelse(
  coefFrame$Without_Outliers == 0 | coefFrame$With_Outliers == 0,
  NA,
  abs((coefFrame$Without_Outliers - coefFrame$With_Outliers) /
    coefFrame$Without_Outliers)
)
coefFrame
## With_Outliers Without_Outliers ChangePercent
## (Intercept) 5.802523e+00 5.457334e+00 0.063252353
## X.Intercept. NA NA NA
## MSZoningFV 3.649421e-01 3.684343e-01 0.009478521
## MSZoningRH 3.500045e-01 3.520675e-01 0.005859735
## MSZoningRL 3.735709e-01 3.756993e-01 0.005665049
## MSZoningRM 3.182831e-01 3.188667e-01 0.001830113
## LotArea -3.304269e-05 -3.054191e-05 0.081880043
## NeighborhoodBlueste -5.449867e-02 -4.547136e-02 0.198527345
## NeighborhoodBrDale -1.116499e-01 -1.033227e-01 0.080594059
## NeighborhoodBrkSide 5.553098e-02 6.656711e-02 0.165789533
## NeighborhoodClearCr 5.659732e-02 5.259807e-02 0.076034234
## NeighborhoodCollgCr 2.062999e-02 1.786393e-02 0.154840555
## NeighborhoodCrawfor 1.478859e-01 1.522841e-01 0.028881523
## NeighborhoodEdwards -3.385192e-02 -2.036974e-02 0.661872399
## NeighborhoodGilbert 4.142402e-02 3.909474e-02 0.059580394
## NeighborhoodIDOTRR 2.986572e-02 4.215798e-02 0.291576069
## NeighborhoodMeadowV -1.555254e-01 -1.490745e-01 0.043272583
## NeighborhoodMitchel -2.167774e-02 -2.115731e-02 0.024598199
## NeighborhoodNAmes 7.881489e-03 1.246246e-02 0.367581717
## NeighborhoodNoRidge 5.258912e-02 3.948794e-02 0.331776816
## NeighborhoodNPkVill -3.213402e-02 -2.662211e-02 0.207042656
## NeighborhoodNridgHt 8.037693e-02 6.310861e-02 0.273628689
## NeighborhoodNWAmes -1.199636e-02 -1.230746e-02 0.025277117
## NeighborhoodOldTown -2.501480e-04 9.785906e-03 1.025562066
## NeighborhoodSawyer 5.545489e-03 1.053541e-02 0.473633302
## NeighborhoodSawyerW 5.175668e-03 3.113401e-03 0.662384114
## NeighborhoodSomerst 5.698168e-02 5.108897e-02 0.115342269
## NeighborhoodStoneBr 1.233484e-01 1.171142e-01 0.053231133
## NeighborhoodSWISU 2.516076e-02 3.181628e-02 0.209185964
## NeighborhoodTimber 3.072531e-02 2.544860e-02 0.207347768
## NeighborhoodVeenker 8.349868e-02 8.338408e-02 0.001374275
## Condition1Feedr 1.771357e-02 1.736042e-02 0.020342532
## Condition1Norm 6.647108e-02 6.637618e-02 0.001429806
## Condition1PosA 5.474452e-02 4.984826e-02 0.098223228
## Condition1PosN 7.462238e-02 6.782891e-02 0.100155990
## Condition1RRAe -4.987018e-02 -5.299316e-02 0.058931784
## Condition1RRAn 3.937796e-02 3.645922e-02 0.080054857
## Condition1RRNe 4.677366e-02 4.231327e-02 0.105413400
## Condition1RRNn 4.750322e-02 4.600635e-02 0.032536218
## Condition2Feedr 1.126415e-01 1.183981e-01 0.048620576
## Condition2Norm 7.516557e-02 7.625750e-02 0.014319068
## Condition2PosA 2.306267e-01 2.122835e-01 0.086408787
## Condition2PosN -8.847304e-01 NA NA
## Condition2RRAe -7.721693e-02 -9.478760e-02 0.185368866
## Condition2RRAn -4.595932e-02 -5.036556e-02 0.087485102
## Condition2RRNn 5.793708e-02 6.628642e-02 0.125958579
## OverallQual 5.442154e-02 5.336399e-02 0.019817590
## OverallCond 3.704648e-02 3.722643e-02 0.004834073
## YearBuilt 7.966508e-04 9.061082e-04 0.120799553
## RoofMatlCompShg 2.791111e+00 2.897357e+00 0.036669871
## RoofMatlMembran 3.056176e+00 3.182010e+00 0.039545363
## RoofMatlMetal 2.874703e+00 2.990259e+00 0.038644207
## RoofMatlRoll 2.759706e+00 2.859959e+00 0.035054187
## RoofMatlTar.Grv 2.812830e+00 2.913899e+00 0.034685092
## RoofMatlWdShake 2.773803e+00 2.878671e+00 0.036429229
## RoofMatlWdShngl 2.858828e+00 2.956575e+00 0.033060867
## BsmtFinSF1 -3.063859e-05 -1.201921e-05 1.549135009
## BsmtFinSF2 -6.365829e-05 -4.878289e-05 0.304930628
## BsmtUnfSF 5.633787e-05 6.306460e-05 0.106664082
## HeatingQCFa -3.736130e-02 -3.575467e-02 0.044934952
## HeatingQCGd -2.208017e-02 -2.141009e-02 0.031297073
## HeatingQCPo -2.249046e-03 5.037986e-03 1.446417699
## HeatingQCTA -3.527001e-02 -3.478845e-02 0.013842430
## CentralAirY 7.002059e-02 7.081335e-02 0.011195029
## X1stFlrSF 6.519201e-05 9.011868e-05 0.276598282
## X2ndFlrSF 5.559914e-05 8.106059e-05 0.314103861
## BsmtFullBath 1.306430e-02 1.455307e-02 0.102299613
## KitchenQualFa -9.053365e-02 -9.100839e-02 0.005216479
## KitchenQualGd -6.537625e-02 -6.593252e-02 0.008436949
## KitchenQualTA -7.231225e-02 -7.313849e-02 0.011296897
## FunctionalMaj2 -1.628810e-01 -1.497195e-01 0.087907740
## FunctionalMin1 6.772371e-02 7.607367e-02 0.109761565
## FunctionalMin2 7.183728e-02 7.843453e-02 0.084111545
## FunctionalMod -3.628051e-02 -3.639894e-02 0.003253706
## FunctionalSev -3.334320e-01 -3.399889e-01 0.019285517
## FunctionalTyp 1.066228e-01 1.154708e-01 0.076625815
## Fireplaces 2.655747e-02 2.398441e-02 0.107280663
## WoodDeckSF 9.812040e-06 -3.575290e-05 1.274440431
## ScreenPorch 2.124378e-04 2.072564e-04 0.024999973
## SaleConditionAdjLand 1.104008e-01 1.059069e-01 0.042431716
## SaleConditionAlloca 4.382500e-02 4.225328e-02 0.037197531
## SaleConditionFamily 6.564367e-03 5.920782e-03 0.108699171
## SaleConditionNormal 7.118894e-02 7.241642e-02 0.016950244
## SaleConditionPartial 1.264246e-01 1.281413e-01 0.013397075
## BasementSF NA NA NA
## OneandTwoFloorSF NA NA NA
## FrontSF -1.117981e-04 -5.703506e-05 0.960165124
## BasementBath 9.120642e-03 5.251275e-03 0.736843416
## Bath 1.388665e-02 1.093968e-02 0.269383349
## Age -1.313474e-03 -1.387582e-03 0.053408510
## YrOfRemodel -6.994120e-04 -6.950417e-04 0.006287863
## HighQualSF 1.629287e-04 1.543048e-04 0.055888385
## TotalArea 3.498281e-05 3.262412e-05 0.072298894
Predict the test dataset with the linear regression models (lmModel1, lmModel2 and lmModel3). The score for lmModel1 is 0.13617, which ranks 1181 (all variables considered). The score for lmModel2 is 0.13101, which ranks 1044 (only variables with high importance and the newly engineered features). The score for lmModel3 is 0.12559, which ranks 846 (only variables with high importance and the newly engineered features, excluding outliers).
# Predict on the test features with each fitted model. lmModel1 uses
# test_feed1; lmModel2 and lmModel3 both use test_feed2, because lmModel3 was
# fit on train_feed3, which is train_feed2 with two rows removed.
# The rank-deficiency warnings are consistent with the NA (aliased)
# coefficients reported in the lmModel2 and lmModel3 summaries.
prediction1 <- predict(lmModel1, test_feed1)
## Warning in predict.lm(lmModel1, test_feed1): prediction from a rank-
## deficient fit may be misleading
prediction2 <- predict(lmModel2, test_feed2)
## Warning in predict.lm(lmModel2, test_feed2): prediction from a rank-
## deficient fit may be misleading
prediction3 <- predict(lmModel3, test_feed2)
## Warning in predict.lm(lmModel3, test_feed2): prediction from a rank-
## deficient fit may be misleading
# Build one submission table per model (columns Id and SalePrice) and write
# each to its own CSV file without row names.
# exp() maps the predictions back from the log scale; lmModel3 is fit on
# LogPrice, and presumably lmModel1 and lmModel2 are as well (confirm).

# lmModel1
df1 <- setNames(data.frame(cbind(id, exp(prediction1))), c('Id', 'SalePrice'))
write.csv(df1, file = "result1.csv", row.names = FALSE)

# lmModel2
df2 <- setNames(data.frame(cbind(id, exp(prediction2))), c('Id', 'SalePrice'))
write.csv(df2, file = "result2.csv", row.names = FALSE)

# lmModel3
df3 <- setNames(data.frame(cbind(id, exp(prediction3))), c('Id', 'SalePrice'))
write.csv(df3, file = "result3.csv", row.names = FALSE)
From this, it can be concluded: 1. With fewer variables involved in the model, the performance has improved. 2. With fewer outliers, the performance has improved.